In [1]:
# Step-by-Step Process for Your Jupyter Notebook
# 1. Import Necessary Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
In [2]:
# 2. Load both datasets
df_vehicles = pd.read_csv('Electric_Vehicle_Population_Data.csv')
df_population_history = pd.read_csv('Electric_Vehicle_Population_Size_History_By_County.csv')

# Display first few rows to understand the structure
df_vehicles.head(), df_population_history.head()
Out[2]:
(   VIN (1-10)     County       City State  Postal Code  Model Year     Make  \
 0  5UXTA6C0XM     Kitsap    Seabeck    WA      98380.0        2021      BMW   
 1  5YJ3E1EB1J     Kitsap    Poulsbo    WA      98370.0        2018    TESLA   
 2  WP0AD2A73G  Snohomish    Bothell    WA      98012.0        2016  PORSCHE   
 3  5YJ3E1EB5J     Kitsap  Bremerton    WA      98310.0        2018    TESLA   
 4  1N4AZ1CP3K       King    Redmond    WA      98052.0        2019   NISSAN   
 
       Model                   Electric Vehicle Type  \
 0        X5  Plug-in Hybrid Electric Vehicle (PHEV)   
 1   MODEL 3          Battery Electric Vehicle (BEV)   
 2  PANAMERA  Plug-in Hybrid Electric Vehicle (PHEV)   
 3   MODEL 3          Battery Electric Vehicle (BEV)   
 4      LEAF          Battery Electric Vehicle (BEV)   
 
   Clean Alternative Fuel Vehicle (CAFV) Eligibility  Electric Range  \
 0           Clean Alternative Fuel Vehicle Eligible            30.0   
 1           Clean Alternative Fuel Vehicle Eligible           215.0   
 2             Not eligible due to low battery range            15.0   
 3           Clean Alternative Fuel Vehicle Eligible           215.0   
 4           Clean Alternative Fuel Vehicle Eligible           150.0   
 
    Base MSRP  Legislative District  DOL Vehicle ID  \
 0        0.0                  35.0       267929112   
 1        0.0                  23.0       475911439   
 2        0.0                   1.0       101971278   
 3        0.0                  23.0       474363746   
 4        0.0                  45.0       476346482   
 
                   Vehicle Location  \
 0  POINT (-122.8728334 47.5798304)   
 1  POINT (-122.6368884 47.7469547)   
 2    POINT (-122.206146 47.839957)   
 3  POINT (-122.6231895 47.5930874)   
 4      POINT (-122.13158 47.67858)   
 
                                 Electric Utility  2020 Census Tract  
 0                         PUGET SOUND ENERGY INC       5.303509e+10  
 1                         PUGET SOUND ENERGY INC       5.303509e+10  
 2                         PUGET SOUND ENERGY INC       5.306105e+10  
 3                         PUGET SOUND ENERGY INC       5.303508e+10  
 4  PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA)       5.303303e+10  ,
                Date        County State Vehicle Primary Use  \
 0      June 30 2024         Caddo    LA           Passenger   
 1  December 31 2017  Pend Oreille    WA               Truck   
 2     April 30 2017       Lincoln    WA               Truck   
 3  February 28 2022       El Paso    CO           Passenger   
 4   January 31 2017        DeKalb    GA           Passenger   
 
    Battery Electric Vehicles (BEVs)  Plug-In Hybrid Electric Vehicles (PHEVs)  \
 0                                 2                                         0   
 1                                 0                                         0   
 2                                 0                                         0   
 3                                 2                                         2   
 4                                 1                                         0   
 
    Electric Vehicle (EV) Total  Non-Electric Vehicle Total  Total Vehicles  \
 0                            2                          10              12   
 1                            0                        5619            5619   
 2                            0                        4464            4464   
 3                            4                         663             667   
 4                            1                          92              93   
 
    Percent Electric Vehicles  
 0                      16.67  
 1                       0.00  
 2                       0.00  
 3                       0.60  
 4                       1.08  )
In [3]:
# 3. Initial Data Inspection

# Check for data types, missing values, and basic information
df_vehicles.info()
df_population_history.info()

# Check for missing or NaN values in both datasets
df_vehicles.isnull().sum(), df_population_history.isnull().sum()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210165 entries, 0 to 210164
Data columns (total 17 columns):
 #   Column                                             Non-Null Count   Dtype  
---  ------                                             --------------   -----  
 0   VIN (1-10)                                         210165 non-null  object 
 1   County                                             210161 non-null  object 
 2   City                                               210161 non-null  object 
 3   State                                              210165 non-null  object 
 4   Postal Code                                        210161 non-null  float64
 5   Model Year                                         210165 non-null  int64  
 6   Make                                               210165 non-null  object 
 7   Model                                              210165 non-null  object 
 8   Electric Vehicle Type                              210165 non-null  object 
 9   Clean Alternative Fuel Vehicle (CAFV) Eligibility  210165 non-null  object 
 10  Electric Range                                     210160 non-null  float64
 11  Base MSRP                                          210160 non-null  float64
 12  Legislative District                               209720 non-null  float64
 13  DOL Vehicle ID                                     210165 non-null  int64  
 14  Vehicle Location                                   210155 non-null  object 
 15  Electric Utility                                   210161 non-null  object 
 16  2020 Census Tract                                  210161 non-null  float64
dtypes: float64(5), int64(2), object(10)
memory usage: 27.3+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23439 entries, 0 to 23438
Data columns (total 10 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Date                                      23439 non-null  object 
 1   County                                    23346 non-null  object 
 2   State                                     23346 non-null  object 
 3   Vehicle Primary Use                       23439 non-null  object 
 4   Battery Electric Vehicles (BEVs)          23439 non-null  int64  
 5   Plug-In Hybrid Electric Vehicles (PHEVs)  23439 non-null  int64  
 6   Electric Vehicle (EV) Total               23439 non-null  int64  
 7   Non-Electric Vehicle Total                23439 non-null  int64  
 8   Total Vehicles                            23439 non-null  int64  
 9   Percent Electric Vehicles                 23439 non-null  float64
dtypes: float64(1), int64(5), object(4)
memory usage: 1.8+ MB
Out[3]:
(VIN (1-10)                                             0
 County                                                 4
 City                                                   4
 State                                                  0
 Postal Code                                            4
 Model Year                                             0
 Make                                                   0
 Model                                                  0
 Electric Vehicle Type                                  0
 Clean Alternative Fuel Vehicle (CAFV) Eligibility      0
 Electric Range                                         5
 Base MSRP                                              5
 Legislative District                                 445
 DOL Vehicle ID                                         0
 Vehicle Location                                      10
 Electric Utility                                       4
 2020 Census Tract                                      4
 dtype: int64,
 Date                                         0
 County                                      93
 State                                       93
 Vehicle Primary Use                          0
 Battery Electric Vehicles (BEVs)             0
 Plug-In Hybrid Electric Vehicles (PHEVs)     0
 Electric Vehicle (EV) Total                  0
 Non-Electric Vehicle Total                   0
 Total Vehicles                               0
 Percent Electric Vehicles                    0
 dtype: int64)
In [4]:
# 4. Data Cleaning
# 4.1 Handling Missing Values

# Drop rows with critical missing values in 'VIN', 'Make', 'Model', etc.
df_vehicles_cleaned = df_vehicles.dropna(subset=['VIN (1-10)', 'Make', 'Model', 'County'])

# Fill or drop NaN values in 'Electric Range' and 'Base MSRP'
df_vehicles_cleaned['Electric Range'].fillna(df_vehicles_cleaned['Electric Range'].mean(), inplace=True)
df_vehicles_cleaned['Base MSRP'].fillna(df_vehicles_cleaned['Base MSRP'].mean(), inplace=True)

# Verify if missing values are handled
df_vehicles_cleaned.isnull().sum()
C:\Users\Sukhpreet\AppData\Local\Temp\ipykernel_100340\1211376461.py:8: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_vehicles_cleaned['Electric Range'].fillna(df_vehicles_cleaned['Electric Range'].mean(), inplace=True)
C:\Users\Sukhpreet\AppData\Local\Temp\ipykernel_100340\1211376461.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_vehicles_cleaned['Electric Range'].fillna(df_vehicles_cleaned['Electric Range'].mean(), inplace=True)
C:\Users\Sukhpreet\AppData\Local\Temp\ipykernel_100340\1211376461.py:9: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_vehicles_cleaned['Base MSRP'].fillna(df_vehicles_cleaned['Base MSRP'].mean(), inplace=True)
C:\Users\Sukhpreet\AppData\Local\Temp\ipykernel_100340\1211376461.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_vehicles_cleaned['Base MSRP'].fillna(df_vehicles_cleaned['Base MSRP'].mean(), inplace=True)
Out[4]:
VIN (1-10)                                             0
County                                                 0
City                                                   0
State                                                  0
Postal Code                                            0
Model Year                                             0
Make                                                   0
Model                                                  0
Electric Vehicle Type                                  0
Clean Alternative Fuel Vehicle (CAFV) Eligibility      0
Electric Range                                         0
Base MSRP                                              0
Legislative District                                 441
DOL Vehicle ID                                         0
Vehicle Location                                       6
Electric Utility                                       0
2020 Census Tract                                      0
dtype: int64
In [5]:
# 4.2 Handling Outliers

# Check for outliers in 'Electric Range' and 'Base MSRP'
sns.boxplot(x=df_vehicles_cleaned['Electric Range'])
plt.show()

# Remove extreme outliers in 'Electric Range'
df_vehicles_cleaned = df_vehicles_cleaned[df_vehicles_cleaned['Electric Range'] < 500]
No description has been provided for this image
In [6]:
# 5. Create Index and Use loc/iloc

# Create a new index (use VIN or another field as the index)
df_vehicles_cleaned.set_index('VIN (1-10)', inplace=True)

# Use loc/iloc to access specific rows and columns
# Access data for a specific vehicle by VIN
df_vehicles_cleaned.loc['5YJ3E1EB1J']
# Access first 5 rows
df_vehicles_cleaned.iloc[0:5] 
Out[6]:
County City State Postal Code Model Year Make Model Electric Vehicle Type Clean Alternative Fuel Vehicle (CAFV) Eligibility Electric Range Base MSRP Legislative District DOL Vehicle ID Vehicle Location Electric Utility 2020 Census Tract
VIN (1-10)
5UXTA6C0XM Kitsap Seabeck WA 98380.0 2021 BMW X5 Plug-in Hybrid Electric Vehicle (PHEV) Clean Alternative Fuel Vehicle Eligible 30.0 0.0 35.0 267929112 POINT (-122.8728334 47.5798304) PUGET SOUND ENERGY INC 5.303509e+10
5YJ3E1EB1J Kitsap Poulsbo WA 98370.0 2018 TESLA MODEL 3 Battery Electric Vehicle (BEV) Clean Alternative Fuel Vehicle Eligible 215.0 0.0 23.0 475911439 POINT (-122.6368884 47.7469547) PUGET SOUND ENERGY INC 5.303509e+10
WP0AD2A73G Snohomish Bothell WA 98012.0 2016 PORSCHE PANAMERA Plug-in Hybrid Electric Vehicle (PHEV) Not eligible due to low battery range 15.0 0.0 1.0 101971278 POINT (-122.206146 47.839957) PUGET SOUND ENERGY INC 5.306105e+10
5YJ3E1EB5J Kitsap Bremerton WA 98310.0 2018 TESLA MODEL 3 Battery Electric Vehicle (BEV) Clean Alternative Fuel Vehicle Eligible 215.0 0.0 23.0 474363746 POINT (-122.6231895 47.5930874) PUGET SOUND ENERGY INC 5.303508e+10
1N4AZ1CP3K King Redmond WA 98052.0 2019 NISSAN LEAF Battery Electric Vehicle (BEV) Clean Alternative Fuel Vehicle Eligible 150.0 0.0 45.0 476346482 POINT (-122.13158 47.67858) PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA) 5.303303e+10
In [36]:
# 6. Rearrange/Count Columns

# Rearrange columns to focus on key data
df_vehicles_cleaned = df_vehicles_cleaned[['Make', 'Model', 'Electric Vehicle Type', 'Electric Range', 'Base MSRP', 'County']]

# Count occurrences of electric vehicle types
ev_type_counts = df_vehicles_cleaned['Electric Vehicle Type'].value_counts()
print(ev_type_counts)
Electric Vehicle Type
Battery Electric Vehicle (BEV)            165552
Plug-in Hybrid Electric Vehicle (PHEV)     44609
Name: count, dtype: int64
In [16]:
# 7. Summary Statistics (Max/Min/Median/Describe)

# Basic descriptive statistics
df_vehicles_cleaned.describe()

# Max and Min values
max_range = df_vehicles_cleaned['Electric Range'].max()
min_range = df_vehicles_cleaned['Electric Range'].min()
median_range = df_vehicles_cleaned['Electric Range'].median()

print(f"Max Electric Range: {max_range}, Min Electric Range: {min_range}, Median Electric Range: {median_range}")
Max Electric Range: 337.0, Min Electric Range: 0.0, Median Electric Range: 0.0
In [18]:
# 8. Visualization: Explore the Data
# 8.1 Electric Vehicle Range Distribution

# Plot the distribution of Electric Range
sns.histplot(df_vehicles_cleaned['Electric Range'], kde=True)
plt.title('Distribution of Electric Vehicle Range')
plt.xlabel('Electric Range')
plt.ylabel('Frequency')
plt.show()
No description has been provided for this image
In [60]:
# 8.2 Count of Electric Vehicles by CIties

# Count the number of electric vehicles by Cities
county_counts = df_vehicles_cleaned['County'].value_counts()

# Plot the top counties with the most electric vehicles
county_counts.head(10).plot(kind='bar', figsize=(10,6), color='skyblue')
plt.title('Top 10 Cities by Electric Vehicle Registration')
plt.xlabel('County')
plt.ylabel('Number of Vehicles')
plt.xticks(rotation=0)
plt.show()
No description has been provided for this image
In [38]:
# 8.3 Bubble Chart of Electric Range vs MSRP

# Bubble chart showing Electric Range vs MSRP for different vehicle types
fig = px.scatter(df_vehicles_cleaned, x="Electric Range", y="Base MSRP", color="Electric Vehicle Type",
                 size="Electric Range", hover_name="Model", size_max=20, title="Electric Range vs MSRP")
fig.show()
In [42]:
# Convert 'Date' column to datetime format with automatic inference
df_population_history['Date'] = pd.to_datetime(df_population_history['Date'], errors='coerce')

# Check the first few rows to confirm the conversion
print(df_population_history['Date'].head())
0   2024-06-30
1   2017-12-31
2   2017-04-30
3   2022-02-28
4   2017-01-31
Name: Date, dtype: datetime64[ns]
In [46]:
# Check the unique values in the 'Date' column
print(df_population_history['Date'].unique())
<DatetimeArray>
['2024-06-30 00:00:00', '2017-12-31 00:00:00', '2017-04-30 00:00:00',
 '2022-02-28 00:00:00', '2017-01-31 00:00:00', '2019-07-31 00:00:00',
 '2019-03-31 00:00:00', '2024-07-31 00:00:00', '2021-03-31 00:00:00',
 '2022-08-31 00:00:00', '2024-04-30 00:00:00', '2020-06-30 00:00:00',
 '2018-05-31 00:00:00', '2022-04-30 00:00:00', '2020-04-30 00:00:00',
 '2017-11-30 00:00:00', '2024-02-29 00:00:00', '2019-08-31 00:00:00',
 '2023-07-31 00:00:00', '2022-03-31 00:00:00', '2020-11-30 00:00:00',
 '2019-06-30 00:00:00', '2019-10-31 00:00:00', '2019-12-31 00:00:00',
 '2024-05-31 00:00:00', '2022-06-30 00:00:00', '2021-05-31 00:00:00',
 '2022-10-31 00:00:00', '2023-02-28 00:00:00', '2023-01-31 00:00:00',
 '2023-06-30 00:00:00', '2022-07-31 00:00:00', '2024-01-31 00:00:00',
 '2022-11-30 00:00:00', '2023-11-30 00:00:00', '2023-04-30 00:00:00',
 '2018-11-30 00:00:00', '2021-12-31 00:00:00', '2019-02-28 00:00:00',
 '2018-10-31 00:00:00', '2020-08-31 00:00:00', '2017-09-30 00:00:00',
 '2022-01-31 00:00:00', '2022-05-31 00:00:00', '2021-01-31 00:00:00',
 '2024-03-31 00:00:00', '2018-03-31 00:00:00', '2020-12-31 00:00:00',
 '2018-08-31 00:00:00', '2023-10-31 00:00:00', '2020-07-31 00:00:00',
 '2020-03-31 00:00:00', '2020-09-30 00:00:00', '2020-02-29 00:00:00',
 '2020-01-31 00:00:00', '2018-04-30 00:00:00', '2017-08-31 00:00:00',
 '2021-10-31 00:00:00', '2018-02-28 00:00:00', '2023-05-31 00:00:00',
 '2024-09-30 00:00:00', '2020-05-31 00:00:00', '2018-06-30 00:00:00',
 '2023-12-31 00:00:00', '2023-08-31 00:00:00', '2017-06-30 00:00:00',
 '2018-01-31 00:00:00', '2021-09-30 00:00:00', '2021-02-28 00:00:00',
 '2024-08-31 00:00:00', '2017-05-31 00:00:00', '2022-09-30 00:00:00',
 '2023-09-30 00:00:00', '2021-11-30 00:00:00', '2018-09-30 00:00:00',
 '2017-02-28 00:00:00', '2019-01-31 00:00:00', '2023-03-31 00:00:00',
 '2018-12-31 00:00:00', '2019-09-30 00:00:00', '2021-06-30 00:00:00',
 '2022-12-31 00:00:00', '2020-10-31 00:00:00', '2017-07-31 00:00:00',
 '2018-07-31 00:00:00', '2019-05-31 00:00:00', '2021-07-31 00:00:00',
 '2021-08-31 00:00:00', '2017-10-31 00:00:00', '2019-04-30 00:00:00',
 '2021-04-30 00:00:00', '2017-03-31 00:00:00', '2019-11-30 00:00:00']
Length: 93, dtype: datetime64[ns]
In [50]:
# 9. Further Insights from Population History Dataset
# 9.1 Electric Vehicle Growth Over Time (Line Plot)

# Plot Electric Vehicle growth over time
plt.figure(figsize=(12, 6))
df_population_history.groupby('Date')['Electric Vehicle (EV) Total'].sum().plot(kind='line', color='green')
plt.title('Electric Vehicle Population Growth Over Time')
plt.xlabel('Year')
plt.ylabel('Total Electric Vehicles')
plt.show()
No description has been provided for this image